# -*- coding: utf-8 -*-
"""
Analysis of scientific output over time.
"""
# Emit a <script> tag that loads require.js into the notebook output
# (presumably needed by plotly's 'notebook' renderer -- confirm).
# The raw `%%HTML` cell magic is not valid Python syntax, so the same magic is
# invoked through the IPython API; this requires an IPython/Colab session.
get_ipython().run_cell_magic('HTML', '', '<script src="require.js"></script>\n')
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.linear_model import LinearRegression
# Make 'notebook' the default plotly renderer used by `fig.show()`.
pio.renderers.default='notebook'
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')
# Data source: Cluster.infoInstituciones.csv - https://drive.google.com/drive/u/0/folders/1APVnfD1zPgTHFAr0YMG6UMTwZ52MQVa9
# Data source: Cluster.cluster.csv - https://drive.google.com/drive/u/0/folders/1APVnfD1zPgTHFAr0YMG6UMTwZ52MQVa9
import os

# Google Drive is already mounted at /content/drive earlier in this file, so the
# repeated `drive.mount` call is not needed here. The `%cd` / `%ls` notebook
# magics are replaced with plain-Python equivalents so the file parses as a
# regular script.
# Work from the project folder so the CSV files read below resolve by
# relative path.
os.chdir('/content/drive/My Drive/3BIO-Cluster-DataAnalysis')
# List the folder contents as a quick check of which files are available.
print(sorted(os.listdir()))
# Data
# Institution records; the code below relies on its `year` column.
df = pd.read_csv("Cluster.infoInstituciones.csv")  # encoding="latin-1")
df
# Store years as integers. Bracket assignment is used because attribute-style
# assignment (`df.year = ...`) silently creates a plain attribute instead of a
# column if the name is ever misspelled.
df["year"] = df["year"].astype(int)
df
df.columns

# Cluster labels; only the `Cluster` column is used.
dfi = pd.read_csv("Cluster.cluster.csv")  # encoding="latin-1")
dfi
dfi.columns
clusters = dfi["Cluster"]

# Append `Cluster` to `df`. `DataFrame.join` aligns on the index, so this
# assumes both files list their rows in the same order -- TODO confirm.
# `year` is already an integer column in `df` and a left join does not alter
# it, so no second cast is needed on `dfm`.
dfm = df.join(clusters)
dfm
dfm.columns

pd.plotting.bootstrap_plot(dfm['year'])
# Overview figures for the merged table `dfm`.
# Every figure is rendered with an explicit `.show()`: a bare `fig4` expression
# only displays when it happens to be the last line of a notebook cell and
# does nothing when this file runs as a script.

# `articulos` against `year`, one facet per cluster.
fig = px.scatter(dfm, x="year", y="articulos", color="Cluster", facet_col="Cluster")
fig.show()

# Distribution of `year` within each cluster.
fig4 = px.box(dfm, y='year', x='Cluster', color='Cluster')
fig4.show()

# `year` against cluster, colored by institution.
fig4 = px.scatter(dfm, y='year', x='Cluster', color='Instituciones')
fig4.show()

# Institution against `year`, colored by cluster.
fig4 = px.scatter(dfm, x='year', y='Instituciones', color='Cluster')
fig4.show()
# all(round(dfm.year) != df.year)
## g = round(dfm.year) != dfm.year
## g.describe()
### g.value_counts('True')
# Columns plotted together on the y axis of the per-cluster box plots.
_PRODUCT_COLUMNS = ['articulos', 'capitulos', 'trabajos_grado', 'innovaciones', 'libros', 'softwares']


def _plot_cluster(cluster_df):
    """Show the three per-cluster figures for one subset of ``dfm``.

    Args:
        cluster_df: Rows of ``dfm`` that share a single ``Cluster`` value.
    """
    # Two box plots of every column in _PRODUCT_COLUMNS per year: colored
    # first by cluster, then by institution.
    for color_by in ("Cluster", "Instituciones"):
        px.box(cluster_df, x='year', y=_PRODUCT_COLUMNS, points="all", color=color_by).show()
    # `articulos` per year, with marker color by institution and marker size
    # by the `articulos` value itself.
    px.scatter(cluster_df, x="year", y="articulos", color="Instituciones", size="articulos").show()


# One subset of `dfm` per cluster label 0-4. The names df0..df4 are kept at
# module level so later cells can still refer to them.
df0 = dfm[dfm["Cluster"] == 0]
df1 = dfm[dfm["Cluster"] == 1]
df2 = dfm[dfm["Cluster"] == 2]
df3 = dfm[dfm["Cluster"] == 3]
df4 = dfm[dfm["Cluster"] == 4]

# Same figures, in the same order, for every cluster.
for _cluster_df in (df0, df1, df2, df3, df4):
    _plot_cluster(_cluster_df)
# Number of rows of `dfm` per `year` value, most frequent year first.
dfy = dfm["year"].value_counts(ascending=False).reset_index()

# Work on a renamed copy so `dfy` keeps the column names pandas generated
# (those names differ between pandas versions, hence the positional rename).
# Both columns hold integers, so no rounding is required.
dfgy = dfy.copy()
dfgy.columns = ['Year', 'Cantidad']

# Count per year; `.show()` renders the figure even outside a notebook cell.
fig4 = px.scatter(dfgy, y='Cantidad', x='Year')
fig4.show()

# Keep `dfgy` ordered from the latest year to the earliest.
dfgy = dfgy.sort_values(by=['Year'], ascending=False)
fgy = sns.pairplot(dfgy)
dfgy.columns
# Number of rows in `dfgy` for each `Year` value.
conteo_años = dfgy.groupby('Year').size()
conteo_años.values

# Total `Cantidad` per year. `as_index=False` returns a fresh two-column frame
# (Year, Cantidad) ordered by year, so the new column added below is not
# written onto a selection of another frame.
años = dfgy.groupby('Year', as_index=False)['Cantidad'].sum()
años

fig = px.bar(años, x="Year", y='Cantidad', title="Años", text_auto=True)
fig.show()

# Every figure is rendered with an explicit `.show()`; a bare `fig4`
# expression does nothing when this file runs as a script.
fig4 = px.scatter(años, y='Cantidad', x='Year', marginal_x='histogram', marginal_y='violin', trendline='ols')
fig4.show()

# Natural logarithm of the yearly totals.
años['log_cant'] = np.log(años['Cantidad'])

fig4 = px.scatter(años, y='log_cant', x='Year', marginal_x='histogram', marginal_y='violin', trendline='ols')
fig4.show()

fig = px.scatter(años, x="Year", y="log_cant", size="log_cant")
fig.show()

yy = sns.pairplot(años)

fig = px.line(años, x="Year", y="log_cant")
fig.show()

fig = px.scatter(años, x="Year", y="Cantidad", marginal_y="box")
fig.show()

años

fig4 = px.line(años, y='log_cant', x='Year')
fig4.show()
# Data source: Cluster.infoGrupos.csv - https://drive.google.com/drive/u/0/folders/1APVnfD1zPgTHFAr0YMG6UMTwZ52MQVa9
# Research-group records; the code below relies on its `year` column.
dfg = pd.read_csv("Cluster.infoGrupos.csv")
dfg
dfg.columns
dfg.T
dfg['year'].plot()
pd.plotting.bootstrap_plot(dfg['year'])

# Number of rows of `dfg` per `year` value, most frequent year first.
# Columns are renamed by position because the names produced by
# `value_counts().reset_index()` differ between pandas versions.
dfgy1 = dfg["year"].value_counts(ascending=False).reset_index()
dfgy1.columns = ['Year', 'Cantidad']

# Every figure is rendered with an explicit `.show()`; a bare `fig4`
# expression does nothing when this file runs as a script.
fig4 = px.scatter(dfgy1, y='Cantidad', x='Year')
fig4.show()

# Total `Cantidad` per year as a fresh two-column frame ordered by year.
años1 = dfgy1.groupby('Year', as_index=False)['Cantidad'].sum()
años1

fig = px.bar(años1, x="Year", y='Cantidad', title="Años", text_auto=True)
fig.show()

# Natural logarithm of the yearly totals.
años1['log_cant'] = np.log(años1['Cantidad'])
fig4 = px.scatter(años1, y='log_cant', x='Year', marginal_x='histogram', marginal_y='violin', trendline='ols')
fig4.show()
#articulos = pd.read_csv('/Users/jmlz_rp/Documents/BIOS/BIOS_rar/Datos/Datos sin procesar/Categorias minciencias/articulos.csv')
# Article records; the code below uses the `anio` and `grupo` columns.
articulos = pd.read_csv('articulos.csv')
articulos
articulos.columns

# `anio` values in ascending order. The expressions that follow are
# notebook-style inspections; their results are discarded when this file runs
# as a script.
a = articulos.anio.sort_values(ascending=True)
a.tail()
a.head()
a.describe()
a.shape

# `grupo` against `anio`, with the x axis limited to 1900-2025.
fig = px.scatter(articulos, x='anio', y='grupo')
fig.update_xaxes(range=[1900, 2025])
# Render explicitly; a bare `fig` expression shows nothing outside a notebook cell.
fig.show()